Input data
# Load the daily bike-share + weather dataset (2922 rows x 29 columns, 2011-2018)
bike_share <- read_csv("bike_sharing_dataset.csv")
## Rows: 2922 Columns: 29── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (28): temp_avg, temp_min, temp_max, temp_observ, precip, wind, wt_fog, ...
## date (1): date
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#View(space_missions)
#View(bike_share)
EDA
# Extract the response series (daily total customers) for convenient access
total_cust = bike_share$total_cust
# Inspect column types and the first few values of every column
str(bike_share)
## spec_tbl_df [2,922 × 29] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ date : Date[1:2922], format: "2011-01-01" "2011-01-02" ...
## $ temp_avg : num [1:2922] NA NA NA NA NA NA NA NA NA NA ...
## $ temp_min : num [1:2922] -1.57 0.88 -3.44 -5.96 -4.29 ...
## $ temp_max : num [1:2922] 11.97 13.81 7.46 4.64 6.11 ...
## $ temp_observ : num [1:2922] 2.77 7.33 -3.06 -3.1 -1.77 ...
## $ precip : num [1:2922] 0.0693 1.0373 1.8788 0 0 ...
## $ wind : num [1:2922] 2.58 3.92 3.62 1.8 2.95 ...
## $ wt_fog : num [1:2922] 1 1 NA NA NA NA 1 1 NA NA ...
## $ wt_heavy_fog : num [1:2922] NA 1 NA NA NA NA NA 1 NA NA ...
## $ wt_thunder : num [1:2922] NA NA NA NA NA NA NA NA NA NA ...
## $ wt_sleet : num [1:2922] NA NA NA NA NA NA NA NA NA NA ...
## $ wt_hail : num [1:2922] NA NA NA NA NA NA NA NA NA NA ...
## $ wt_glaze : num [1:2922] NA NA NA NA NA NA NA NA NA NA ...
## $ wt_haze : num [1:2922] 1 NA NA NA NA NA 1 1 NA NA ...
## $ wt_drift_snow : num [1:2922] NA NA NA NA NA NA NA NA NA NA ...
## $ wt_high_wind : num [1:2922] NA NA NA NA NA NA NA NA 1 1 ...
## $ wt_mist : num [1:2922] 1 1 NA NA NA NA 1 1 NA NA ...
## $ wt_drizzle : num [1:2922] NA 1 NA NA NA NA NA NA NA NA ...
## $ wt_rain : num [1:2922] 1 1 NA NA NA NA NA NA NA NA ...
## $ wt_freeze_rain : num [1:2922] NA NA NA NA NA NA NA NA NA NA ...
## $ wt_snow : num [1:2922] NA NA NA NA NA 1 1 1 NA NA ...
## $ wt_ground_fog : num [1:2922] NA NA NA NA NA NA NA NA NA NA ...
## $ wt_ice_fog : num [1:2922] NA NA NA NA NA NA NA 1 NA NA ...
## $ wt_freeze_drizzle: num [1:2922] NA NA NA NA NA NA NA NA NA NA ...
## $ wt_unknown : num [1:2922] NA NA NA NA NA NA NA NA NA NA ...
## $ casual : num [1:2922] 330 130 120 107 82 88 148 68 54 41 ...
## $ registered : num [1:2922] 629 651 1181 1429 1489 ...
## $ total_cust : num [1:2922] 959 781 1301 1536 1571 ...
## $ holiday : num [1:2922] NA NA NA NA NA NA NA NA NA NA ...
## - attr(*, "spec")=
## .. cols(
## .. date = col_date(format = ""),
## .. temp_avg = col_double(),
## .. temp_min = col_double(),
## .. temp_max = col_double(),
## .. temp_observ = col_double(),
## .. precip = col_double(),
## .. wind = col_double(),
## .. wt_fog = col_double(),
## .. wt_heavy_fog = col_double(),
## .. wt_thunder = col_double(),
## .. wt_sleet = col_double(),
## .. wt_hail = col_double(),
## .. wt_glaze = col_double(),
## .. wt_haze = col_double(),
## .. wt_drift_snow = col_double(),
## .. wt_high_wind = col_double(),
## .. wt_mist = col_double(),
## .. wt_drizzle = col_double(),
## .. wt_rain = col_double(),
## .. wt_freeze_rain = col_double(),
## .. wt_snow = col_double(),
## .. wt_ground_fog = col_double(),
## .. wt_ice_fog = col_double(),
## .. wt_freeze_drizzle = col_double(),
## .. wt_unknown = col_double(),
## .. casual = col_double(),
## .. registered = col_double(),
## .. total_cust = col_double(),
## .. holiday = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
summary(bike_share)
## date temp_avg temp_min temp_max
## Min. :2011-01-01 Min. :-12.100 Min. :-16.9937 Min. :-7.98
## 1st Qu.:2012-12-31 1st Qu.: 6.567 1st Qu.: 0.5165 1st Qu.:11.08
## Median :2014-12-31 Median : 15.433 Median : 8.5049 Median :19.99
## Mean :2014-12-31 Mean : 14.419 Mean : 8.5065 Mean :19.02
## 3rd Qu.:2016-12-30 3rd Qu.: 23.067 3rd Qu.: 17.3384 3rd Qu.:27.87
## Max. :2018-12-31 Max. : 31.733 Max. : 26.2063 Max. :37.85
## NA's :821
## temp_observ precip wind wt_fog
## Min. :-15.658 Min. : 0.00000 Min. : 0.375 Min. :1
## 1st Qu.: 3.013 1st Qu.: 0.00551 1st Qu.: 2.200 1st Qu.:1
## Median : 11.619 Median : 0.27150 Median : 2.900 Median :1
## Mean : 11.069 Mean : 3.43573 Mean : 3.163 Mean :1
## 3rd Qu.: 19.767 3rd Qu.: 2.88538 3rd Qu.: 3.875 3rd Qu.:1
## Max. : 28.667 Max. :118.78980 Max. :12.750 Max. :1
## NA's :1419
## wt_heavy_fog wt_thunder wt_sleet wt_hail wt_glaze
## Min. :1 Min. :1 Min. :1 Min. :1 Min. :1
## 1st Qu.:1 1st Qu.:1 1st Qu.:1 1st Qu.:1 1st Qu.:1
## Median :1 Median :1 Median :1 Median :1 Median :1
## Mean :1 Mean :1 Mean :1 Mean :1 Mean :1
## 3rd Qu.:1 3rd Qu.:1 3rd Qu.:1 3rd Qu.:1 3rd Qu.:1
## Max. :1 Max. :1 Max. :1 Max. :1 Max. :1
## NA's :2714 NA's :2228 NA's :2793 NA's :2872 NA's :2769
## wt_haze wt_drift_snow wt_high_wind wt_mist wt_drizzle
## Min. :1 Min. :1 Min. :1 Min. :1 Min. :1
## 1st Qu.:1 1st Qu.:1 1st Qu.:1 1st Qu.:1 1st Qu.:1
## Median :1 Median :1 Median :1 Median :1 Median :1
## Mean :1 Mean :1 Mean :1 Mean :1 Mean :1
## 3rd Qu.:1 3rd Qu.:1 3rd Qu.:1 3rd Qu.:1 3rd Qu.:1
## Max. :1 Max. :1 Max. :1 Max. :1 Max. :1
## NA's :2217 NA's :2915 NA's :2664 NA's :2551 NA's :2794
## wt_rain wt_freeze_rain wt_snow wt_ground_fog wt_ice_fog
## Min. :1 Min. :1 Min. :1 Min. :1 Min. :1
## 1st Qu.:1 1st Qu.:1 1st Qu.:1 1st Qu.:1 1st Qu.:1
## Median :1 Median :1 Median :1 Median :1 Median :1
## Mean :1 Mean :1 Mean :1 Mean :1 Mean :1
## 3rd Qu.:1 3rd Qu.:1 3rd Qu.:1 3rd Qu.:1 3rd Qu.:1
## Max. :1 Max. :1 Max. :1 Max. :1 Max. :1
## NA's :2516 NA's :2917 NA's :2838 NA's :2886 NA's :2912
## wt_freeze_drizzle wt_unknown casual registered
## Min. :1 Min. :1 Min. : 2.0 Min. : 19
## 1st Qu.:1 1st Qu.:1 1st Qu.: 512.2 1st Qu.: 3839
## Median :1 Median :1 Median : 1220.5 Median : 5964
## Mean :1 Mean :1 Mean : 1679.8 Mean : 6046
## 3rd Qu.:1 3rd Qu.:1 3rd Qu.: 2357.2 3rd Qu.: 8188
## Max. :1 Max. :1 Max. :10173.0 Max. :15419
## NA's :2918 NA's :2921 NA's :4 NA's :4
## total_cust holiday
## Min. : 21 Min. :1
## 1st Qu.: 4628 1st Qu.:1
## Median : 7442 Median :1
## Mean : 7726 Mean :1
## 3rd Qu.:10850 3rd Qu.:1
## Max. :19113 Max. :1
## NA's :4 NA's :2833
# Locate missing values in the response. Note: arr.ind=TRUE is a no-op for a
# plain vector, and the list() wrapper only changes how the result prints.
na_index = list(which(is.na(bike_share$total_cust), arr.ind=TRUE))
na_index
## [[1]]
## [1] 1849 1850 1851 1852
There are 4 consecutive days with missing data in total_cust.
# Zoom in on the window around the 4-day gap to see the neighboring values
tsplot(total_cust[1800:1900])
Thus, I will fill those values by linear interpolation between the values at indices 1848 and 1853.
# Linear interpolation across the 4-day gap: step from the value at index 1848
# toward the value at index 1853 in fifths of the total difference, so
# total_cust[1849] = v[1848] - (1/5)*(v[1848] - v[1853]), etc.
temp = (total_cust[[1848]] - total_cust[[1853]] )
temp
## [1] 2457
bike_share$total_cust[1849] <- total_cust[[1848]] - (1/5)*temp
bike_share$total_cust[1850] <- total_cust[[1848]] - 2*(1/5)*temp
bike_share$total_cust[1851] <- total_cust[[1848]] - 3*(1/5)*temp
bike_share$total_cust[1852] <- total_cust[[1848]] - 4*(1/5)*temp
# Refresh the convenience copy and confirm the gap is filled
total_cust <- bike_share$total_cust
tsplot(total_cust[1800:1900])
which(is.na(bike_share$total_cust), arr.ind=TRUE)
## integer(0)
Now there is no more NA value in total_cust
# Time plot of the (now complete) raw total-customer series
ggplot(bike_share, aes(x = date, y = total_cust)) +
  geom_line() +
  labs(x = 'Time', y = 'Total Customers') +
  theme_minimal()
tsplot(bike_share$total_cust)
Use log version
# Log-transform the response to stabilize its variance
log_total_cust <- log(bike_share$total_cust)
tsplot(log_total_cust)
# Time plot of the log-transformed total-customer series
ggplot(bike_share, aes(x = date, y = log_total_cust)) +
  geom_line() +
  labs(x = 'Time', y = 'Transformed Total Customers') +
  theme_minimal()
Non Parametric Trend
Smoothing with frequency = 7
# Re-stamp the log series with a weekly period (freq = 7) and overlay a
# Gaussian kernel smoother on the plot
cust_2week = ts(log_total_cust, freq=7)
tsplot(cust_2week, col=8, ylab = 'Total Customers') # the time scale matters (not shown)
lines(ksmooth(time(cust_2week), cust_2week, "normal", bandwidth=12), lwd=2, col=4)
Smoothing with frequency = 14
cust_2week = ts(log_total_cust, freq=14)
tsplot(cust_2week, col=8, ylab = 'Total Customers') # the time scale matters (not shown)
lines(ksmooth(time(cust_2week), cust_2week, "normal", bandwidth=12), lwd=2, col=4)
Smoothing with frequency = 28
cust_2week = ts(log_total_cust, freq=28)
tsplot(cust_2week, col=8, ylab = 'Total Customers') # the time scale matters (not shown)
lines(ksmooth(time(cust_2week), cust_2week, "normal", bandwidth=12), lwd=2, col=4)
Decomposition
# Classical decomposition (trend + seasonal + remainder) at an annual period
sub1_ts <- ts(log_total_cust, start=c(2011,1), end=c(2018,365), frequency=365)
cust.dec = decompose(sub1_ts)
plot(cust.dec)
# NOTE(review): cust_2week holds the same log series (last assigned with
# freq=28 above), so this second decomposition re-stamps the identical data at
# frequency 365 and repeats the plot above — confirm whether this was intended.
sub1_ts <- ts(cust_2week, start=c(2011,1), end=c(2018,365), frequency=365)
cust.dec = decompose(sub1_ts)
plot(cust.dec)
Fitting SARIMA Model
Now we check the detrended and differenced time series to see which one is stationary
# Side-by-side comparison: log transform (top) vs. first difference of the
# raw series (bottom). Note diff() here is applied to the untransformed data.
par(mfrow=2:1) # plot transformed data
tsplot(log(total_cust), main="log" )
tsplot(diff(total_cust), main="differenced" )
Differenced time series looks stationary.
tsplot(diff(log_total_cust))
summary(ur.ers(diff(log_total_cust)))
##
## ###############################################
## # Elliot, Rothenberg and Stock Unit Root Test #
## ###############################################
##
## Test of type DF-GLS
## detrending of series with intercept
##
##
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.3195 -0.0392 0.1196 0.2621 3.3528
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## yd.lag -0.50388 0.03496 -14.41 <2e-16 ***
## yd.diff.lag1 -0.63055 0.03396 -18.57 <2e-16 ***
## yd.diff.lag2 -0.57342 0.03166 -18.11 <2e-16 ***
## yd.diff.lag3 -0.41255 0.02679 -15.40 <2e-16 ***
## yd.diff.lag4 -0.16818 0.01830 -9.19 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4282 on 2911 degrees of freedom
## Multiple R-squared: 0.5772, Adjusted R-squared: 0.5764
## F-statistic: 794.7 on 5 and 2911 DF, p-value: < 2.2e-16
##
##
## Value of test-statistic is: -14.4112
##
## Critical values of DF-GLS are:
## 1pct 5pct 10pct
## critical values -2.57 -1.94 -1.62
adf.test(diff(log_total_cust))
## Warning in adf.test(diff(log_total_cust)): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: diff(log_total_cust)
## Dickey-Fuller = -22.058, Lag order = 14, p-value = 0.01
## alternative hypothesis: stationary
pp.test(diff(log_total_cust))
## Warning in pp.test(diff(log_total_cust)): p-value smaller than printed p-value
##
## Phillips-Perron Unit Root Test
##
## data: diff(log_total_cust)
## Dickey-Fuller Z(alpha) = -2851.4, Truncation lag parameter = 9, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(diff(log_total_cust))
## Warning in kpss.test(diff(log_total_cust)): p-value greater than printed p-value
##
## KPSS Test for Level Stationarity
##
## data: diff(log_total_cust)
## KPSS Level = 0.028709, Truncation lag parameter = 9, p-value = 0.1
After 4 tests, I am confident that the differenced time series is stationary.
mod1 <- Arima(log_total_cust, order = c(0,1,0))
plot(mod1$residuals)
acf(mod1$residuals)
pacf(mod1$residuals)
mod1
## Series: log_total_cust
## ARIMA(0,1,0)
##
## sigma^2 = 0.1654: log likelihood = -1516.44
## AIC=3034.88 AICc=3034.88 BIC=3040.86
mod1 <- Arima(log_total_cust, order = c(1,1,0))
plot(mod1$residuals)
acf(mod1$residuals)
pacf(mod1$residuals)
mod1
## Series: log_total_cust
## ARIMA(1,1,0)
##
## Coefficients:
## ar1
## -0.3081
## s.e. 0.0176
##
## sigma^2 = 0.1497: log likelihood = -1370.88
## AIC=2745.76 AICc=2745.76 BIC=2757.71
mod1 <- Arima(log_total_cust, order = c(1,1,1))
plot(mod1$residuals)
acf(mod1$residuals)
pacf(mod1$residuals)
mod1
## Series: log_total_cust
## ARIMA(1,1,1)
##
## Coefficients:
## ar1 ma1
## 0.3602 -0.9054
## s.e. 0.0208 0.0090
##
## sigma^2 = 0.1234: log likelihood = -1087.92
## AIC=2181.85 AICc=2181.85 BIC=2199.78
auto.arima(log_total_cust)
## Series: log_total_cust
## ARIMA(3,1,2)
##
## Coefficients:
## ar1 ar2 ar3 ma1 ma2
## -0.511 0.3065 -0.0382 -0.0285 -0.7842
## s.e. 0.058 0.0323 0.0208 0.0552 0.0509
##
## sigma^2 = 0.1231: log likelihood = -1083.78
## AIC=2179.56 AICc=2179.59 BIC=2215.44
mod1 <- Arima(log_total_cust, order = c(3,1,2))
plot(mod1$residuals)
acf(mod1$residuals)
pacf(mod1$residuals)
coeftest(mod1)
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ar1 -0.511038 0.058015 -8.8087 < 2e-16 ***
## ar2 0.306507 0.032295 9.4908 < 2e-16 ***
## ar3 -0.038163 0.020764 -1.8379 0.06608 .
## ma1 -0.028493 0.055182 -0.5163 0.60561
## ma2 -0.784198 0.050940 -15.3946 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod1
## Series: log_total_cust
## ARIMA(3,1,2)
##
## Coefficients:
## ar1 ar2 ar3 ma1 ma2
## -0.511 0.3065 -0.0382 -0.0285 -0.7842
## s.e. 0.058 0.0323 0.0208 0.0552 0.0509
##
## sigma^2 = 0.1231: log likelihood = -1083.78
## AIC=2179.56 AICc=2179.59 BIC=2215.44
mod1 <- Arima(log_total_cust, order = c(1,1,1))
plot(mod1$residuals)
acf(mod1$residuals)
pacf(mod1$residuals)
coeftest(mod1)
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ar1 0.3601614 0.0207536 17.354 < 2.2e-16 ***
## ma1 -0.9053887 0.0089533 -101.123 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod1
## Series: log_total_cust
## ARIMA(1,1,1)
##
## Coefficients:
## ar1 ma1
## 0.3602 -0.9054
## s.e. 0.0208 0.0090
##
## sigma^2 = 0.1234: log likelihood = -1087.92
## AIC=2181.85 AICc=2181.85 BIC=2199.78
Best ARIMA model:
mod1 <- Arima(log_total_cust, order = c(1,1,1))
plot(mod1$residuals)
acf(mod1$residuals)
pacf(mod1$residuals)
coeftest(mod1)
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ar1 0.3601614 0.0207536 17.354 < 2.2e-16 ***
## ma1 -0.9053887 0.0089533 -101.123 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod1
## Series: log_total_cust
## ARIMA(1,1,1)
##
## Coefficients:
## ar1 ma1
## 0.3602 -0.9054
## s.e. 0.0208 0.0090
##
## sigma^2 = 0.1234: log likelihood = -1087.92
## AIC=2181.85 AICc=2181.85 BIC=2199.78
#stationary tests for residuals
adf.test(mod1$residuals)
## Warning in adf.test(mod1$residuals): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: mod1$residuals
## Dickey-Fuller = -14.095, Lag order = 14, p-value = 0.01
## alternative hypothesis: stationary
pp.test(mod1$residuals)
## Warning in pp.test(mod1$residuals): p-value smaller than printed p-value
##
## Phillips-Perron Unit Root Test
##
## data: mod1$residuals
## Dickey-Fuller Z(alpha) = -2898.9, Truncation lag parameter = 9, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(mod1$residuals)
## Warning in kpss.test(mod1$residuals): p-value greater than printed p-value
##
## KPSS Test for Level Stationarity
##
## data: mod1$residuals
## KPSS Level = 0.19141, Truncation lag parameter = 9, p-value = 0.1
summary(ur.ers(mod1$residuals))
##
## ###############################################
## # Elliot, Rothenberg and Stock Unit Root Test #
## ###############################################
##
## Test of type DF-GLS
## detrending of series with intercept
##
##
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.6054 -0.0820 0.0471 0.1573 1.2879
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## yd.lag -1.03487 0.04154 -24.912 <2e-16 ***
## yd.diff.lag1 0.03902 0.03720 1.049 0.2943
## yd.diff.lag2 0.03323 0.03205 1.037 0.2999
## yd.diff.lag3 0.01562 0.02616 0.597 0.5505
## yd.diff.lag4 0.03633 0.01855 1.958 0.0503 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3511 on 2912 degrees of freedom
## Multiple R-squared: 0.4992, Adjusted R-squared: 0.4984
## F-statistic: 580.7 on 5 and 2912 DF, p-value: < 2.2e-16
##
##
## Value of test-statistic is: -24.9122
##
## Critical values of DF-GLS are:
## 1pct 5pct 10pct
## critical values -2.57 -1.94 -1.62
Model utility tests:
checkresiduals(mod1)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,1,1)
## Q* = 15.324, df = 8, p-value = 0.05314
##
## Model df: 2. Total lags used: 10
tsdiag(mod1) # looks good
#mod1 = sarima(total_cust,p=1,d=1,q=1,P=3,D=0,Q=0,S=7)
densityplot(as.numeric(mod1$residuals)) # these look uniform
sarima(log_total_cust, 1,1,1)
## initial value -0.899661
## iter 2 value -0.953066
## iter 3 value -0.971487
## iter 4 value -0.981832
## iter 5 value -1.004571
## iter 6 value -1.020625
## iter 7 value -1.033720
## iter 8 value -1.037279
## iter 9 value -1.041903
## iter 10 value -1.045136
## iter 11 value -1.045558
## iter 12 value -1.045578
## iter 13 value -1.045589
## iter 14 value -1.045592
## iter 15 value -1.045592
## iter 16 value -1.045592
## iter 16 value -1.045592
## iter 16 value -1.045592
## final value -1.045592
## converged
## initial value -1.046514
## iter 2 value -1.046520
## iter 3 value -1.046521
## iter 4 value -1.046522
## iter 5 value -1.046522
## iter 5 value -1.046522
## iter 5 value -1.046522
## final value -1.046522
## converged
## $fit
##
## Call:
## arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D, Q), period = S),
## xreg = constant, transform.pars = trans, fixed = fixed, optim.control = list(trace = trc,
## REPORT = 1, reltol = tol))
##
## Coefficients:
## ar1 ma1 constant
## 0.3603 -0.9055 4e-04
## s.e. 0.0208 0.0090 1e-03
##
## sigma^2 estimated as 0.1233: log likelihood = -1087.83, aic = 2183.66
##
## $degrees_of_freedom
## [1] 2918
##
## $ttable
## Estimate SE t.value p.value
## ar1 0.3603 0.0208 17.3607 0.0000
## ma1 -0.9055 0.0090 -101.1686 0.0000
## constant 0.0004 0.0010 0.4335 0.6646
##
## $AIC
## [1] 0.7475719
##
## $AICc
## [1] 0.7475747
##
## $BIC
## [1] 0.7557605
predictions_sarima = predict(mod1,n.ahead=60)
predictions_sarima
## $pred
## Time Series:
## Start = 2923
## End = 2982
## Frequency = 1
## [1] 8.100830 8.215069 8.256213 8.271032 8.276369 8.278291 8.278984 8.279233
## [9] 8.279323 8.279355 8.279367 8.279371 8.279373 8.279373 8.279373 8.279373
## [17] 8.279373 8.279373 8.279373 8.279373 8.279373 8.279373 8.279373 8.279373
## [25] 8.279373 8.279373 8.279373 8.279373 8.279373 8.279373 8.279373 8.279373
## [33] 8.279373 8.279373 8.279373 8.279373 8.279373 8.279373 8.279373 8.279373
## [41] 8.279373 8.279373 8.279373 8.279373 8.279373 8.279373 8.279373 8.279373
## [49] 8.279373 8.279373 8.279373 8.279373 8.279373 8.279373 8.279373 8.279373
## [57] 8.279373 8.279373 8.279373 8.279373
##
## $se
## Time Series:
## Start = 2923
## End = 2982
## Frequency = 1
## [1] 0.3512246 0.3858388 0.3963692 0.4018129 0.4058315 0.4093753 0.4127392
## [8] 0.4160232 0.4192628 0.4224710 0.4256525 0.4288096 0.4319433 0.4350544
## [15] 0.4381433 0.4412106 0.4442567 0.4472820 0.4502870 0.4532721 0.4562377
## [22] 0.4591841 0.4621117 0.4650209 0.4679120 0.4707854 0.4736413 0.4764801
## [29] 0.4793021 0.4821076 0.4848968 0.4876701 0.4904277 0.4931699 0.4958969
## [36] 0.4986091 0.5013065 0.5039895 0.5066583 0.5093131 0.5119541 0.5145816
## [43] 0.5171958 0.5197968 0.5223848 0.5249601 0.5275228 0.5300731 0.5326112
## [50] 0.5351373 0.5376515 0.5401540 0.5426450 0.5451246 0.5475929 0.5500502
## [57] 0.5524965 0.5549321 0.5573570 0.5597714
Forecasting:
# NOTE(review): forecast() is given exp(mod1$fitted) as a plain series, so it
# fits a new default model to the fitted values rather than forecasting from
# the ARIMA fit itself. To forecast from mod1, use forecast(mod1, h = 365)
# and exponentiate the result — confirm intent.
forecastArea <- forecast(exp(mod1$fitted), h = 365)
plot(forecastArea,lwd=2,col="purple", main="Forecasts from SARIMA(1,1,1)", xlab="Time", ylab="Number of customers")
legend("topleft", legend=c("Past", "Future"), col=c("Purple", "Blue"), lty=1:2, cex=0.8)
Xreg with temp_max, precipitation, and wind
Check NA values in columns temp_max
na_index = list(which(is.na(bike_share$temp_max), arr.ind=TRUE))
na_index
## [[1]]
## integer(0)
precipitation:
na_index = list(which(is.na(bike_share$precip), arr.ind=TRUE))
na_index
## [[1]]
## integer(0)
wind:
na_index = list(which(is.na(bike_share$wind), arr.ind=TRUE))
na_index
## [[1]]
## integer(0)
We convert the temperature from Celsius to Fahrenheit to avoid NAs from taking the log of negative values.
# Convert Celsius to Fahrenheit so every temperature is positive and log()
# produces no NaN values (min temp_max is about -8 C, i.e. ~17.6 F)
temp_max = bike_share$temp_max * 1.8 + 32
na_index = list(which(is.na(log(temp_max)), arr.ind=TRUE))
na_index
## [[1]]
## integer(0)
We shift the precipitation up by 1 so we can apply the log transformation, i.e. log(precip + 1).
# BUG FIX: this check previously re-tested log(temp_max) (copy-paste from the
# block above); it should verify that the shifted log transform of
# precipitation, log(precip + 1), introduces no NA/NaN values. Since precip
# has no NAs and its minimum is 0, the result is still integer(0).
na_index = list(which(is.na(log(bike_share$precip + 1)), arr.ind=TRUE))
na_index
## [[1]]
## integer(0)
hol = bike_share$holiday
# use is.na() to identify NA values in the holiday column (NA = non-holiday)
na_values <- is.na(hol)
# use ifelse() to replace NA values with 0
hol1 <- ifelse(na_values, 0, hol) #for latter models
# hol1 is 0/1, so this maps holidays to log(2) and non-holidays to 0
holiday = log(hol1 + 1)
temp_max = log(temp_max)
# NOTE(review): temp_max was overwritten with its log on the line above, so
# this computes log(log(temp_max_F)) — a double log. Likely intended
# log_temp_max <- temp_max; confirm (log_temp_max appears unused in the
# remainder of this file).
log_temp_max = log(temp_max)
# Shifted log for precipitation (min 0), plain log for wind (min > 0)
precip = log(bike_share$precip + 1)
wind = log(bike_share$wind)
tsplot(temp_max)
tsplot(precip)
tsplot(wind)
plot(log_total_cust,temp_max)
plot(log_total_cust,precip)
plot(log_total_cust,wind)
na_index = list(which(is.na(precip), arr.ind=TRUE))
na_index
## [[1]]
## integer(0)
length(log_total_cust)
## [1] 2922
#na_or_inf = is.na(xreg) | is.infinite(xreg)
# prepare matrix of our independent x-variables (log-scale weather covariates)
xreg <- cbind(temp_max, precip, wind)
fit <- auto.arima(log_total_cust, xreg = xreg)
checkresiduals(fit)
##
## Ljung-Box test
##
## data: Residuals from Regression with ARIMA(3,1,2) errors
## Q* = 19.44, df = 5, p-value = 0.001591
##
## Model df: 5. Total lags used: 10
sarima(log_total_cust, 3,1,2, xreg = xreg)
## initial value -0.928119
## iter 2 value -1.053179
## iter 3 value -1.090721
## iter 4 value -1.106148
## iter 5 value -1.117877
## iter 6 value -1.120666
## iter 7 value -1.125214
## iter 8 value -1.128461
## iter 9 value -1.130893
## iter 10 value -1.131619
## iter 11 value -1.132809
## iter 12 value -1.133803
## iter 13 value -1.136221
## iter 14 value -1.137416
## iter 15 value -1.138183
## iter 16 value -1.138604
## iter 17 value -1.138917
## iter 18 value -1.139127
## iter 19 value -1.139186
## iter 20 value -1.139265
## iter 21 value -1.139352
## iter 22 value -1.139393
## iter 23 value -1.139421
## iter 24 value -1.139459
## iter 25 value -1.139462
## iter 26 value -1.139464
## iter 27 value -1.139468
## iter 28 value -1.139475
## iter 29 value -1.139484
## iter 30 value -1.139488
## iter 31 value -1.139490
## iter 32 value -1.139491
## iter 33 value -1.139493
## iter 34 value -1.139496
## iter 35 value -1.139507
## iter 36 value -1.139508
## iter 37 value -1.139513
## iter 38 value -1.139518
## iter 39 value -1.139521
## iter 40 value -1.139522
## iter 41 value -1.139531
## iter 42 value -1.139537
## iter 43 value -1.139545
## iter 44 value -1.139553
## iter 45 value -1.139574
## iter 46 value -1.139650
## iter 47 value -1.139664
## iter 48 value -1.139695
## iter 49 value -1.139732
## iter 50 value -1.139833
## iter 51 value -1.139932
## iter 52 value -1.140018
## iter 53 value -1.140025
## iter 54 value -1.140036
## iter 55 value -1.140043
## iter 56 value -1.140059
## iter 57 value -1.140067
## iter 58 value -1.140081
## iter 59 value -1.140101
## iter 60 value -1.140121
## iter 61 value -1.140130
## iter 62 value -1.140133
## iter 63 value -1.140134
## iter 64 value -1.140134
## iter 65 value -1.140135
## iter 66 value -1.140135
## iter 67 value -1.140136
## iter 68 value -1.140137
## iter 69 value -1.140137
## iter 69 value -1.140137
## final value -1.140137
## converged
## initial value -1.139132
## iter 2 value -1.139134
## iter 3 value -1.139138
## iter 4 value -1.139138
## iter 5 value -1.139139
## iter 6 value -1.139141
## iter 7 value -1.139143
## iter 8 value -1.139144
## iter 9 value -1.139145
## iter 10 value -1.139145
## iter 11 value -1.139145
## iter 12 value -1.139146
## iter 13 value -1.139146
## iter 14 value -1.139146
## iter 14 value -1.139146
## iter 14 value -1.139146
## final value -1.139146
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), xreg = xreg, transform.pars = trans, fixed = fixed, optim.control = list(trace = trc,
## REPORT = 1, reltol = tol))
##
## Coefficients:
## ar1 ar2 ar3 ma1 ma2 temp_max precip wind
## -0.6848 0.2155 0.0266 -0.0608 -0.8048 0.8707 -0.0972 -0.1181
## s.e. 0.0677 0.0302 0.0223 0.0646 0.0610 0.0469 0.0066 0.0152
##
## sigma^2 estimated as 0.1024: log likelihood = -817.27, aic = 1652.55
##
## $degrees_of_freedom
## [1] 2913
##
## $ttable
## Estimate SE t.value p.value
## ar1 -0.6848 0.0677 -10.1183 0.0000
## ar2 0.2155 0.0302 7.1398 0.0000
## ar3 0.0266 0.0223 1.1898 0.2342
## ma1 -0.0608 0.0646 -0.9411 0.3467
## ma2 -0.8048 0.0610 -13.1964 0.0000
## temp_max 0.8707 0.0469 18.5500 0.0000
## precip -0.0972 0.0066 -14.6585 0.0000
## wind -0.1181 0.0152 -7.7902 0.0000
##
## $AIC
## [1] 0.565748
##
## $AICc
## [1] 0.5657649
##
## $BIC
## [1] 0.5841722
Spectrum
mvspec(log_total_cust, col=4, lwd=1)
mvspec(log_total_cust, col=4, lwd=1, log='y')
Nonparametric spectral estimation: smoothing the periodogram yields better confidence intervals.
mvspec(log_total_cust, spans = 2, col=4, lwd=1)
mvspec(log_total_cust, spans = 2, col=4, lwd=1, log='y')
Fit an autoregressive spectral estimator to the customer series using
spec.ar(). Display the parametric spectral estimate curve and the
non-parametric one on the same graph, and write a sentence to compare
them.
Find the biggest frequency: (7, 8, 9)
cust_spec = mvspec(log_total_cust, spans = 2, col=4, lwd=1)
which(cust_spec$spec>50)
## [1] 8
which(cust_spec$spec>40)
## [1] 8 9
which(cust_spec$spec>30)
## [1] 7 8 9
# The important frequency is spot 8 and 9
cust_spec$spec[8]
## [1] 73.00253
cust_spec$spec[9]
## [1] 42.83057
# Frequencies
cust_spec$freq[8]
## [1] 0.002666667
cust_spec$freq[9]
## [1] 0.003
# Periods
1/cust_spec$freq[8]
## [1] 375
1/cust_spec$freq[9]
## [1] 333.3333
t = time(bike_share$date)
omega1 = cust_spec$freq[8]
# Let's see how much omega1 explains
Xcos = cos(2*pi*t*omega1)
Xsin = sin(2*pi*t*omega1)
modC = lm(log_total_cust ~ log(t) + Xcos+Xsin)
summary(modC) # adjusted R^2 of 0.276
##
## Call:
## lm(formula = log_total_cust ~ log(t) + Xcos + Xsin)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.2992 -0.1246 0.0559 0.2385 1.0542
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.205848 0.054571 113.72 <2e-16 ***
## log(t) 0.368908 0.007739 47.67 <2e-16 ***
## Xcos -0.397353 0.010843 -36.65 <2e-16 ***
## Xsin 0.187456 0.010819 17.33 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4121 on 2918 degrees of freedom
## Multiple R-squared: 0.5788, Adjusted R-squared: 0.5784
## F-statistic: 1337 on 3 and 2918 DF, p-value: < 2.2e-16
plot(modC,which=1) # looks good
tsplot(modC$residuals) # looks good
histfit=ts(predict(modC),start=1)
preddf = cbind(log_total_cust, histfit)
plot(exp(preddf), plot.type="single", col = c("black","red"))
t = time(bike_share$date)
omega1 = cust_spec$freq[8]
omega2 = cust_spec$freq[9]
# Let's see how much omega1 and omega2 explains
Xcos = cos(2*pi*t*omega1)
Xsin = sin(2*pi*t*omega1)
Xcos2 = cos(2*pi*t*omega2)
Xsin2 = sin(2*pi*t*omega2)
modC = lm(log_total_cust ~ log(t) + Xcos+Xsin +Xcos2+Xsin2)
summary(modC)
##
## Call:
## lm(formula = log_total_cust ~ log(t) + Xcos + Xsin + Xcos2 +
## Xsin2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.4430 -0.1350 0.0609 0.2312 0.9451
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.168969 0.053330 115.675 <2e-16 ***
## log(t) 0.374606 0.007564 49.528 <2e-16 ***
## Xcos -0.392065 0.010533 -37.222 <2e-16 ***
## Xsin 0.185559 0.010508 17.658 <2e-16 ***
## Xcos2 0.141027 0.010509 13.419 <2e-16 ***
## Xsin2 -0.012495 0.010513 -1.189 0.235
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4 on 2916 degrees of freedom
## Multiple R-squared: 0.6034, Adjusted R-squared: 0.6027
## F-statistic: 887.4 on 5 and 2916 DF, p-value: < 2.2e-16
plot(modC,which=1)
tsplot(modC$residuals)
histfit=ts(predict(modC),start=1)
preddf = cbind(log_total_cust, histfit)
plot(exp(preddf), plot.type="single", col = c("black","red"))
Mixed Model:
# Mixed model: log trend + dominant spectral harmonics + weather/holiday
# covariates
t = time(bike_share$date)
omega1 = cust_spec$freq[8]
omega2 = cust_spec$freq[9]
omega3 = cust_spec$freq[7]
Xcos = cos(2*pi*t*omega1)
Xsin = sin(2*pi*t*omega1)
Xcos2 = cos(2*pi*t*omega2)
Xsin2 = sin(2*pi*t*omega2)
Xcos3 = cos(2*pi*t*omega3)
Xsin3 = sin(2*pi*t*omega3)
# NOTE(review): Xsin2, Xcos3 and Xsin3 are constructed above but omitted from
# the formula below — confirm whether the omega2 sine and omega3 terms were
# meant to be included.
modC = lm(log_total_cust ~ log(t) + Xcos+Xsin +Xcos2+temp_max+ precip + wind + holiday)
summary(modC)
##
## Call:
## lm(formula = log_total_cust ~ log(t) + Xcos + Xsin + Xcos2 +
## temp_max + precip + wind + holiday)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.9342 -0.1298 0.0335 0.1934 0.9284
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.608141 0.152241 17.132 < 2e-16 ***
## log(t) 0.366204 0.006341 57.748 < 2e-16 ***
## Xcos -0.100327 0.014559 -6.891 6.77e-12 ***
## Xsin 0.105084 0.009757 10.771 < 2e-16 ***
## Xcos2 0.057372 0.009357 6.132 9.87e-10 ***
## temp_max 0.921385 0.035836 25.711 < 2e-16 ***
## precip -0.127907 0.006309 -20.275 < 2e-16 ***
## wind -0.086461 0.015245 -5.671 1.56e-08 ***
## holiday -0.391616 0.052179 -7.505 8.10e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3352 on 2913 degrees of freedom
## Multiple R-squared: 0.7219, Adjusted R-squared: 0.7211
## F-statistic: 945 on 8 and 2913 DF, p-value: < 2.2e-16
plot(modC,which=1)
tsplot(modC$residuals)
histfit=ts(predict(modC),start=1)
preddf = cbind(log_total_cust, histfit)
plot(exp(preddf), plot.type="single", col = c("black","red"))